************************************************************************
* This file uses worker-data on wages and occupational layers to 
* construct firm hierarchies and within-firm inequality measures. 
* It then adds firm characteristics from the business register (FIRM)
************************************************************************

clear all
set more off

* Load the worker-level data from the project data directory
cd "${data_dir}"
use workerdata_layer.dta, clear

* Keep only observations with sektor code "2" (private firms)
keep if sektor == "2"
capture drop CPI

* Manager flag: the first character of the occupation code funk equals "1"
generate firstdigit = substr(funk, 1, 1)
generate manager = (firstdigit == "1")

* Add firm data from Business Register
*******************************************
* Many worker rows map to one firm-year row; firm-years that appear only in the
* register file (_merge == 2) are discarded.
merge n:1 aar cvrnr using GF_firmdata.dta
keep if _merge != 2
drop _merge

destring aar, replace

* Number of plants per firm-year:
* flag the first row of every (year, firm, plant) cell ...
bysort aar cvrnr lbnr: generate countplant = (_n == 1)
* ... but do not count cells whose plant ID equals notphysical.
* NOTE(review): notphysical is not defined in this file; presumably it is a
* variable supplied by the input data - confirm.
replace countplant = 0 if lbnr == notphysical
by aar cvrnr: egen plants = total(countplant)

* CPI deflator
*******************
* aar is turned back into a string, so the year tests below are string comparisons.
tostring aar, replace
generate CPI = 0

* Index levels for 1991 through 2010, listed in year order. Each level is divided
* by the level of the base year 2000 (5253). Years outside this range keep CPI = 0,
* so the divisions further down turn their values into missing.
local cpi_levels 4353 4445 4500 4590 4686 4785 4890 4980 5104 5253 ///
    5377 5507 5622 5687 5790 5900 6001 6205 6287 6432
local yr = 1991
foreach level of local cpi_levels {
    replace CPI = `level'/5253 if aar == "`yr'"
    local ++yr
}

* Deflate worker pay and firm outcomes
foreach v of varlist joblon timelon sales va {
    replace `v' = `v'/CPI
}

* Firm-year totals of hours and pay, and pay per hour
capture drop totalh
bysort aar cvrnr: egen totalh = total(hours)
by aar cvrnr: egen totalw = total(joblon)
generate totalnw = totalw / totalh

* Layers data for firms (Caliendo et al)
* layerN ranks the occupied layers inside each firm-year: the lowest occupied
* layer gets rank 1, the next occupied layer rank 2, and so on up to rank 4.
* Each pass looks only at layers strictly above the previous minimum.
bysort aar cvrnr: egen minlayer = min(layer)
set more off
generate layerN = 1 if layer == minlayer
forvalues rank = 2/4 {
    generate layer_above = layer if minlayer != . & layer > minlayer
    drop minlayer
    by aar cvrnr: egen minlayer = min(layer_above)
    replace layerN = `rank' if layer == minlayer
    drop layer_above
}
drop minlayer
* The number of layers of a firm-year is its highest rank.
bysort aar cvrnr: egen layers = max(layerN)
* Blank the count where implayershare2 exceeds 0.2. A missing implayershare2
* also satisfies this condition, because missing compares as larger than any number.
replace layers = . if implayershare2 > 0.2

* Composition of occupational layers:
* occlayerind1-occlayerind4 equal 1 when the firm-year has at least one worker
* in the corresponding category of layer.
quietly tabulate layer, generate(occlayerdummy)
set more off
sort aar cvrnr
foreach z of numlist 1/4 {
    by aar cvrnr: egen occlayerind`z' = max(occlayerdummy`z')
}

* Trim outliers in hourly wages year by year: only observations between the
* 0.25th and the 99.75th percentile of timelon are kept (0.5% trimmed in total).
bysort aar: egen p025wage = pctile(timelon), p(0.25)
bysort aar: egen p9975wage = pctile(timelon), p(99.75)
keep if timelon >= p025wage & timelon <= p9975wage
* Zero wages are treated like missing wages and removed as well.
replace timelon = . if timelon == 0
drop if timelon == .

* Skill composition of the firm-year workforce
* 1) share of workers with educ equal to 4 or 5 (college or more)
generate collegeplus = inlist(educ, 4, 5)
sort cvrnr aar
by cvrnr aar: egen collegeshare = mean(collegeplus)

* 2) schooling, rescaled by 1/12; mean and sd skip missing values
summarize schooling
replace schooling = schooling / 12
* show in which years schooling is missing
tabulate aar if schooling == .
by cvrnr aar: egen avschool = mean(schooling)
by cvrnr aar: egen sdschool = sd(schooling)

* 3) experience, rescaled by 1/1000; mean and sd
replace experience = experience / 1000
by cvrnr aar: egen avexperience = mean(experience)
by cvrnr aar: egen sdexperience = sd(experience)

* Span of control: size of the whole firm relative to its top layer.
* counter is summed separately over the top layer and over all other layers.
* NOTE(review): counter is not created in this file; presumably a per-worker
* count coming from the input data - confirm.
by cvrnr aar: egen maxlayer = max(layer)
generate topcounter = counter if layer == maxlayer
generate bottomcounter = counter if layer != maxlayer
by cvrnr aar: egen totaltopcounter = total(topcounter)
by cvrnr aar: egen totalbottomcounter = total(bottomcounter)

* A bottom/top ratio would lose every one-layer firm. Using (top + bottom) / top
* instead gives those firms a span of 1.
generate span = (totaltopcounter + totalbottomcounter) / totaltopcounter

* Alternative layers measures:
* Both alternatives re-run the ranking used for the baseline layer count on a
* coarser three-level classification and store the highest rank per firm-year.

* 1) Combine the two bottom layers into one

gen layer_3max = 1 if layer == 1 | layer == 2
replace layer_3max = 2 if layer ==3
replace layer_3max = 3 if layer ==4

by cvrnr aar: egen minlayer = min(layer_3max)
set more off
* layerN is still in memory from the baseline layer count (only minlayer was
* dropped there), so remove it before generating it again.
cap drop layerN
gen layerN = 1 if layer_3max == minlayer
foreach lay of numlist 2/3 {
   gen layerright = layer_3max if layer_3max>minlayer & minlayer!=.
   drop minlayer
   by cvrnr aar: egen minlayer = min(layerright) 
   replace layerN = `lay' if layer_3max == minlayer
   drop layerright
}
drop minlayer 
by cvrnr aar: egen layers_3max = max(layerN)
drop layerN
* No need to worry about missing occupations: layers_3max is set to missing
* below wherever the baseline measure layers is missing.


* 2) 3 layers: top and middle management, all others
* The first three characters of funk are converted to a number; force turns
* non-numeric codes into missing.
gen threedigits = substr(funk,1,3)
destring threedigits, replace force

cap drop altlayer
* layer 4 -> 3; layer 3 with a three-digit code below 200 -> 2; every other
* non-missing layer -> 1
gen altlayer = 3 if layer ==4 
replace altlayer = 2 if layer == 3 & (threedigits < 200)
replace altlayer = 1 if layer != . & altlayer == .

by cvrnr aar: egen minlayer = min(altlayer)
set more off
gen layerN = 1 if altlayer == minlayer
foreach lay of numlist 2/3 {
   gen layerright = altlayer if altlayer>minlayer & minlayer!=.
   drop minlayer
   by cvrnr aar: egen minlayer = min(layerright) 
   replace layerN = `lay' if altlayer == minlayer
   drop layerright
}
drop minlayer
by cvrnr aar: egen layers_alt = max(layerN)
drop layerN

* The alternative measures are only used from 1999 onwards and only where the
* baseline measure is defined. aar is a string at this point (tostring in the
* CPI section), so it is converted with real() before the numeric comparison.
replace layers_alt = . if real(aar) < 1999
replace layers_alt = . if layers == .

replace layers_3max = . if real(aar) < 1999
replace layers_3max = . if layers == .

* Measures of inequality
******************************************************

* Stayer flags, built within firm-worker spells ordered by year:
*   preexisting   - observed with a wage in the same firm one year earlier
*   nextexisting  - observed with a wage in the same firm one year later
*   pre2existing  - observed in each of the two preceding years
*   next2existing - observed in each of the two following years
destring aar, generate(year)
sort cvrnr pnr year
by cvrnr pnr: generate preexisting = (timelon[_n-1] != . & year[_n-1] == year - 1)
by cvrnr pnr: generate nextexisting = (timelon[_n+1] != . & year[_n+1] == year + 1)
by cvrnr pnr: generate pre2existing = (timelon[_n-1] != . & year[_n-1] == year - 1 ///
    & timelon[_n-2] != . & year[_n-2] == year - 2)
by cvrnr pnr: generate next2existing = (timelon[_n+1] != . & year[_n+1] == year + 1 ///
    & timelon[_n+2] != . & year[_n+2] == year + 2)

* From here on timelon holds the log hourly wage
replace timelon = log(timelon)
by cvrnr pnr: generate wagegrowth = timelon - timelon[_n-1] if year == year[_n-1] + 1

* Individual wage growth split by manager status. The product is 0, not missing,
* for workers outside the group, so the firm means further down average over
* every worker with a defined wage growth.
* NOTE(review): confirm that the zeros are intended.
generate managerwagegrowth = wagegrowth * manager
generate workerwagegrowth = wagegrowth * (1 - manager)

* Firm-year dispersion of log wages: standard deviation and percentile gaps
bysort aar cvrnr: egen loginequality1 = sd(timelon)
foreach p in 10 25 50 75 90 99 {
    by aar cvrnr: egen logpc`p' = pctile(timelon), p(`p')
}
generate loginequality2 = logpc90 - logpc10
generate loginequality3 = logpc75 - logpc25
generate loginequality4 = logpc90 - logpc50
generate loginequality5 = logpc50 - logpc10
generate loginequality6 = logpc99 - logpc50
drop logpc*

* Number of stayers per firm-year
by aar cvrnr: egen totalpre = total(preexisting)
by aar cvrnr: egen totalnext = total(nextexisting)
set more off

* Standard deviation within each stayer group. The helper is filled only on the
* rows of the group; taking its maximum copies the value to every row of the
* firm-year.
foreach grp in pre next pre2 next2 {
    by aar cvrnr: egen help_log`grp'inequality1 = sd(timelon) if `grp'existing == 1
}
foreach grp in pre next pre2 next2 {
    by aar cvrnr: egen log`grp'inequality1 = max(help_log`grp'inequality1)
}

* Average wage growth for stayers: managers vs workers
foreach role in manager worker {
    by aar cvrnr: egen mean`role'wagegrowth = mean(`role'wagegrowth)
}

* Percentiles within each stayer group, copied to all rows in the same way
foreach p in 10 25 50 75 90 {
    foreach grp in pre next pre2 next2 {
        by aar cvrnr: egen log`grp'pc`p'help = pctile(timelon) if `grp'existing == 1, p(`p')
        by aar cvrnr: egen log`grp'pc`p' = max(log`grp'pc`p'help)
    }
}

* Percentile gaps per stayer group: 90-10, 75-25, 90-50 and 50-10
foreach grp in pre next pre2 next2 {
    generate log`grp'inequality2 = log`grp'pc90 - log`grp'pc10
    generate log`grp'inequality3 = log`grp'pc75 - log`grp'pc25
    generate log`grp'inequality4 = log`grp'pc90 - log`grp'pc50
    generate log`grp'inequality5 = log`grp'pc50 - log`grp'pc10
}
drop lognextpc* logprepc* lognext2pc* logpre2pc*

* Wage inequality accounting for worker observables:

* Run OLS regressions for log hourly wages (timelon is in logs at this point),
* taking out differences across workers due to observables. Each specification
* is written to TableA2.tex and leaves a linear prediction (wageprojectionK)
* and a residual (wageresK) in the data.
gen age2 = age^2
gen experience2 = experience^2
gen tenure2 = tenure^2

* numeric copy of the string year variable
cap drop year
cap destring aar, gen(year)

* Baseline
reg timelon age age2 experience experience2 i.educ i.gender
outreg2 using TableA2.tex, tex alpha(0.01, 0.05, 0.1) symbol(***,**,*) bdec(4) replace
predict wageprojection0, xb
predict wageres0, res

* Add time effects
reg timelon age age2 experience experience2 i.educ i.gender i.year
outreg2 using TableA2.tex, tex alpha(0.01, 0.05, 0.1) symbol(***,**,*) bdec(4) append
predict wageprojection1, xb
predict wageres1, res

* Add tenure (gender is already part of the baseline)
reg timelon age age2 experience experience2 tenure tenure2 i.educ i.gender i.year
outreg2 using TableA2.tex, tex alpha(0.01, 0.05, 0.1) symbol(***,**,*) bdec(4) append
predict wageprojection2, xb
predict wageres2, res

* Add occupation FE
* areg gets explicit dummies in place of the i. terms.
* NOTE(review): educdummy2-educdummy5 assumes that educ takes five values - confirm.
quietly tab educ, gen(educdummy)
gen female = gender == 2
quietly tab year, gen(yeardummy)
* threedigits already exists as a numeric variable from the alternative layer
* measures; rebuild it here as the raw three-character occupation code.
cap drop threedigits
gen threedigits = substr(funk,1,3)
egen occupationcode = group(threedigits)
areg timelon age age2 experience experience2 tenure tenure2 educdummy2-educdummy5 female yeardummy*, absorb(occupationcode)
outreg2 using TableA2.tex, tex alpha(0.01, 0.05, 0.1) symbol(***,**,*) bdec(4) append
predict wageprojection3, xb
predict wageres3, res

* Add worker FE
egen personcode = group(pnr)
areg timelon age age2 experience experience2 tenure tenure2 educdummy2-educdummy5 female yeardummy*, absorb(personcode)
outreg2 using TableA2.tex, tex alpha(0.01, 0.05, 0.1) symbol(***,**,*) bdec(4) append
predict wageprojection4, xb
predict wageres4, res

* The dummies are needed by both areg calls, so they are dropped only now.
drop educdummy* 
drop yeardummy*

* Save a panel data set of wage residuals for variance decomposition:
preserve
keep aar pnr cvrnr wageres* wageprojection*
save wage_residuals.dta, replace 
restore

* Focus on two different specifications: (wageres2) and (wageres4)
***********************************************************************
* For each residual the loop builds the same firm-year measures as for log
* wages: residual wage growth, standard deviation, percentile gaps, the same
* statistics within the stayer groups, and mean residual wage growth of
* managers and workers. Output names end in _model2 or _model4.

set more off
foreach x in 2 4 {
    sort cvrnr pnr aar
    by cvrnr pnr: gen reswagegrowth`x' = wageres`x' - wageres`x'[_n-1] if year == year[_n-1]+1

    * Standard deviation of residual wages, all workers of the firm-year
    bys aar cvrnr: egen resinequality1_model`x' = sd(wageres`x')

    * Standard deviation within each stayer group. egen with an if condition
    * fills only the rows of the group, and the firm-level file later keeps one
    * arbitrary row per firm-year. The value is therefore copied to every row
    * with max(), exactly as is done for the log-wage measures.
    foreach grp in pre next pre2 next2 {
        tempvar sdhelp
        by aar cvrnr: egen `sdhelp' = sd(wageres`x') if `grp'existing == 1
        by aar cvrnr: egen res`grp'inequality1_model`x' = max(`sdhelp')
        drop `sdhelp'
    }

    * Wage gaps:
    by aar cvrnr: egen logpc10 = pctile(wageres`x'), p(10)
    by aar cvrnr: egen logpc25 = pctile(wageres`x'), p(25)
    by aar cvrnr: egen logpc50 = pctile(wageres`x'), p(50)
    by aar cvrnr: egen logpc75 = pctile(wageres`x'), p(75)
    by aar cvrnr: egen logpc90 = pctile(wageres`x'), p(90)
    by aar cvrnr: egen logpc99 = pctile(wageres`x'), p(99)
    gen resinequality2_model`x' = logpc90 - logpc10
    gen resinequality3_model`x' = logpc75 - logpc25
    gen resinequality4_model`x' = logpc90 - logpc50
    gen resinequality5_model`x' = logpc50 - logpc10
    gen resinequality6_model`x' = logpc99 - logpc50
    drop logpc*

    * Percentiles within each stayer group, copied to all rows of the firm-year
    foreach y in 10 25 50 75 90 {
        by aar cvrnr: egen logprepc`y'help = pctile(wageres`x') if preexisting == 1, p(`y')
        by aar cvrnr: egen logprepc`y' = max(logprepc`y'help)
        by aar cvrnr: egen logpre2pc`y'help = pctile(wageres`x') if pre2existing == 1, p(`y')
        by aar cvrnr: egen logpre2pc`y' = max(logpre2pc`y'help)
        by aar cvrnr: egen lognextpc`y'help = pctile(wageres`x') if nextexisting == 1, p(`y')
        by aar cvrnr: egen lognextpc`y' = max(lognextpc`y'help)
        by aar cvrnr: egen lognext2pc`y'help = pctile(wageres`x') if next2existing == 1, p(`y')
        by aar cvrnr: egen lognext2pc`y' = max(lognext2pc`y'help)
    }
    gen respreinequality2_model`x' = logprepc90 - logprepc10
    gen respreinequality3_model`x' = logprepc75 - logprepc25
    gen respreinequality4_model`x' = logprepc90 - logprepc50
    gen respreinequality5_model`x' = logprepc50 - logprepc10
    gen respre2inequality2_model`x' = logpre2pc90 - logpre2pc10
    gen respre2inequality3_model`x' = logpre2pc75 - logpre2pc25
    gen respre2inequality4_model`x' = logpre2pc90 - logpre2pc50
    gen respre2inequality5_model`x' = logpre2pc50 - logpre2pc10

    gen resnextinequality2_model`x' = lognextpc90 - lognextpc10
    gen resnextinequality3_model`x' = lognextpc75 - lognextpc25
    gen resnextinequality4_model`x' = lognextpc90 - lognextpc50
    gen resnextinequality5_model`x' = lognextpc50 - lognextpc10
    gen resnext2inequality2_model`x' = lognext2pc90 - lognext2pc10
    gen resnext2inequality3_model`x' = lognext2pc75 - lognext2pc25
    gen resnext2inequality4_model`x' = lognext2pc90 - lognext2pc50
    gen resnext2inequality5_model`x' = lognext2pc50 - lognext2pc10
    drop lognextpc* logprepc* lognext2pc* logpre2pc*

    * Managers and Workers
    * The product is 0, not missing, for workers outside the group, so the means
    * below average over every worker with a defined residual wage growth.
    * NOTE(review): confirm that the zeros are intended.
    gen managerreswagegrowth`x' = reswagegrowth`x' * manager
    gen workerreswagegrowth`x' = reswagegrowth`x' * (1-manager)
    * average residual wage growth for stayers: managers vs workers:
    foreach var in manager worker {
        by aar cvrnr: egen mean`var'reswagegrowth`x' = mean(`var'reswagegrowth`x')
    }

}

* top-bottom pay gap:
* Mean and median pay in the highest and in the lowest occupied layer of each
* firm-year, for several wage concepts.
sort cvrnr aar
cap drop maxlayer
by cvrnr aar: egen maxlayer = max(layer)
cap drop minlayer
by cvrnr aar: egen minlayer = min(layer)

* timelon was converted to logs for the inequality measures; restore the level
* and keep the log as a separate variable.
replace timelon = exp(timelon) 
gen logw = log(timelon)

* log(wages) for preexisting workforce:
* stayers / nextstayers hold logw only for workers observed in the same firm in
* the previous / following year. aar is a string here, so the year arithmetic
* uses the numeric copy year.
sort cvrnr pnr aar
by cvrnr pnr: gen stayers = logw if logw[_n-1] != . & year[_n-1] == year-1
by cvrnr pnr: gen nextstayers = logw if logw[_n+1] != . & year[_n+1] == year+1

foreach var in timelon logw stayers nextstayers wageres2 wageres4 {
    * value of the wage concept for workers in the top / bottom layer only
    gen top`var' = `var' if layer == maxlayer 
    gen bottom`var' = `var' if layer == minlayer

    bys cvrnr aar: egen meantop`var' = mean(top`var')
    by cvrnr aar: egen meanbottom`var' = mean(bottom`var')

    by cvrnr aar: egen mediantop`var' = median(top`var')
    by cvrnr aar: egen medianbottom`var' = median(bottom`var')
}

* Level of observation: Firms
* Collapse to one row per firm-year, build wage gaps, year-on-year changes and
* layer transitions, and save firm_characteristics.dta.

* bysort rather than by: the preceding section leaves the data sorted by
* cvrnr aar, and a plain by prefix on aar cvrnr would stop as not sorted.
bysort aar cvrnr: keep if _n==1

* The skill-composition variables are kept as well; their first differences
* are taken further down.
keep aar cvrnr fixedassets investment sales va empl* nworkers nmanagers ///
sector009 sector027 sector053 layers* span plants ///
occlayerind* totalw totalnw totalh* ///
collegeshare avschool sdschool avexperience sdexperience ///
loginequality* logpreinequality* lognextinequality* ///
logpre2inequality* lognext2inequality* resinequality* ///
respreinequality* respre2inequality* resnextinequality* resnext2inequality* ///
meanmanager* meanworker* meantop* mediantop* meanbottom* medianbottom*

* Top-bottom wage gaps: wagegap1 uses layer means, wagegap3 uses layer medians
gen wagegap1_topbottom_log = meantoplogw - meanbottomlogw
gen wagegap1_topbottom_res2 = meantopwageres2 - meanbottomwageres2
gen wagegap1_topbottom_res4 = meantopwageres4 - meanbottomwageres4
gen wagegap1_topbottom_stay = meantopstayers - meanbottomstayers
gen wagegap1_topbottom_nextstay = meantopnextstayers - meanbottomnextstayers

gen wagegap3_topbottom_log = mediantoplogw - medianbottomlogw
gen wagegap3_topbottom_res2 = mediantopwageres2 - medianbottomwageres2
gen wagegap3_topbottom_res4 = mediantopwageres4 - medianbottomwageres4
gen wagegap3_topbottom_stay = mediantopstayers - medianbottomstayers
gen wagegap3_topbottom_nextstay = mediantopnextstayers - medianbottomnextstayers

* aar is a string up to here; every change below needs year arithmetic, so it
* is converted to a number before the first difference is taken.
destring aar, replace
sort cvrnr aar

* Year-on-year changes of the gaps, defined only for consecutive years. The
* stayer versions compare this year's stayers with last year's next-stayers.
by cvrnr: gen dgap1_log = wagegap1_topbottom_log - wagegap1_topbottom_log[_n-1] if aar-aar[_n-1]==1
by cvrnr: gen dgap1_res2 = wagegap1_topbottom_res2 - wagegap1_topbottom_res2[_n-1] if aar-aar[_n-1]==1
by cvrnr: gen dgap1_res4 = wagegap1_topbottom_res4 - wagegap1_topbottom_res4[_n-1] if aar-aar[_n-1]==1
by cvrnr: gen dgap1_stay = wagegap1_topbottom_stay - wagegap1_topbottom_nextstay[_n-1] if aar-aar[_n-1]==1

by cvrnr: gen dgap3_log = wagegap3_topbottom_log - wagegap3_topbottom_log[_n-1] if aar-aar[_n-1]==1
by cvrnr: gen dgap3_res2 = wagegap3_topbottom_res2 - wagegap3_topbottom_res2[_n-1] if aar-aar[_n-1]==1
by cvrnr: gen dgap3_res4 = wagegap3_topbottom_res4 - wagegap3_topbottom_res4[_n-1] if aar-aar[_n-1]==1
by cvrnr: gen dgap3_stay = wagegap3_topbottom_stay - wagegap3_topbottom_nextstay[_n-1] if aar-aar[_n-1]==1

* Logs of the inequality measures (log2... for log wages, res2... for residuals)
foreach xx in inequality1 inequality2 inequality3 inequality4 inequality5 inequality6 ///
preinequality1 preinequality2 preinequality3 preinequality4 preinequality5 ///
nextinequality1 nextinequality2 nextinequality3 nextinequality4 nextinequality5{
    cap drop log2`xx'
    gen log2`xx' = log(log`xx')
}

* Residual measures exist for models 2 and 4 only, so the loop runs over
* exactly those two values.
foreach xx in inequality1 inequality2 inequality3 inequality4 inequality5 inequality6 ///
preinequality1 preinequality2 preinequality3 preinequality4 preinequality5 ///
nextinequality1 nextinequality2 nextinequality3 nextinequality4 nextinequality5{
    foreach yy in 2 4 {
        cap drop res2`xx'_model`yy'
        gen res2`xx'_model`yy' = log(res`xx'_model`yy')
    }
}

* Changes in log inequality between consecutive years
sort cvrnr aar
foreach xx in inequality {
    forvalues yy = 1/6{
        by cvrnr: gen dlog2`xx'`yy' = log2`xx'`yy' - log2`xx'`yy'[_n-1] if aar-aar[_n-1]==1
        foreach z in 2 4{
            by cvrnr: gen dres2`xx'`yy'_model`z' = res2`xx'`yy'_model`z' - res2`xx'`yy'_model`z'[_n-1] if aar-aar[_n-1]==1
        }
    }
}

* Changes for stayers: this year's pre measure minus last year's next measure
sort cvrnr aar
forvalues y = 1/5{
    by cvrnr: gen dlog2preinequality`y' = log2preinequality`y' - log2nextinequality`y'[_n-1] if aar-aar[_n-1]==1
}
forvalues y = 1/5{
    by cvrnr: gen dres2preinequality`y' = res2preinequality`y'_model4 - res2nextinequality`y'_model4[_n-1] if aar-aar[_n-1]==1
}

* Distance in years to the previous observation of the firm
sort cvrnr aar
by cvrnr: gen gap = aar - aar[_n-1]

* Layer transitions between consecutive years with a defined layer count
by cvrnr: gen droplayers = layers < layers[_n-1] if gap == 1 & layers != . & layers[_n-1] != .
by cvrnr: gen addlayers = layers > layers[_n-1] if gap == 1 & layers != . & layers[_n-1] != .
by cvrnr: gen samelayers = layers == layers[_n-1] if gap == 1 & layers != . & layers[_n-1] != .
by cvrnr: gen layerchanges = layers - layers[_n-1] if gap == 1 & layers != . & layers[_n-1] != .
* Unlike the lines above, laglayers is not restricted to gap == 1.
by cvrnr: gen laglayers = layers[_n-1] if layers[_n-1] != .

* adddrop: 1 if layers were added, -1 if dropped, 0 if unchanged
gen adddrop = 1 if layerchanges > 0 & layerchanges != .
replace adddrop = -1 if layerchanges < 0 & layerchanges != .
replace adddrop = 0 if layerchanges == 0 & layerchanges != .
by cvrnr: gen nextadddrop = adddrop[_n+1]


* Log changes of firm outcomes between consecutive years
set more off
by cvrnr: gen lagsales = sales[_n-1] if gap == 1
by cvrnr: gen dsales = log(sales) - log(sales[_n-1]) if gap == 1
by cvrnr: gen dva = log(va) - log(va[_n-1]) if gap == 1
by cvrnr: gen dhours = log(totalh) - log(totalh[_n-1]) if gap == 1
by cvrnr: gen dwages = log(totalnw) - log(totalnw[_n-1]) if gap == 1

* xtset needs a numeric panel identifier. No firmcode survives the keep list
* above, so it is built from cvrnr here.
cap drop firmcode
egen firmcode = group(cvrnr)
xtset firmcode aar
gen dcollege = D.collegeshare
gen dexperience = D.avexperience
gen dschooling = D.avschool

gen dsdexperience = D.sdexperience
gen dsdschooling = D.sdschool

gen logva = log(va)
gen logempl = log(empl2)

* The saved file stores the year as a string again
tostring aar, replace
sort aar cvrnr

save firm_characteristics.dta, replace

